library(slendr)
library(ggtree)
library(adegenet)
library(StAMPP)
library(vcfR)
library(ggplot2)

check out empirical data

#read in vcf
vcfR <- read.vcfR("~/Desktop/mex.towhees/towhee.75.mac2.unlinked.nomito.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 4800
##   header_line: 4801
##   variant count: 2668
##   column count: 41
## 
Meta line 1000 read in.
Meta line 2000 read in.
Meta line 3000 read in.
Meta line 4000 read in.
Meta line 4800 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2668
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 2668
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2668
## All variants processed
#read in sample info csv
sample.info<-read.csv("~/Desktop/mex.towhees/sample.locs.csv")
#reorder sampling file to match order of samples in vcf
sample.info<-sample.info[match(colnames(vcfR@gt)[-1], sample.info$ID),]
sample.info$ID == colnames(vcfR@gt)[-1]
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE
sample.info$pop<-c("mac","mac",rep("hyb", times=4),rep("ocai", times=4),rep("mac", times=10),
            rep("hyb", times=4),rep("mac", times=4),rep("ocai", times=4))
#convert to genlight
gen<-vcfR2genlight(vcfR)
#
pop(gen)<-sample.info$pop
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

fst<-stamppFst(gen)
fst$Fsts
##             mac        hyb ocai
## mac          NA         NA   NA
## hyb  0.04580035         NA   NA
## ocai 0.07879940 0.03361103   NA
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$het<-sample.info$sample.loc
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$het[i]<-sum(mat[,i][!is.na(mat[,i])] == "0/1")/sum(!is.na(mat[,i]))
}
sample.info$het<-as.numeric(as.character(sample.info$het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=pop, y=het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

This is the empirical pattern of relatedness (unrooted neighbor joining tree), divergence (Fst), and diversity (hetrozygosity) that we will try to match via simulation.

setup first model

#define each population
mac <- population("mac", time = 1, N = 10000)
ocai <- population("ocai", time = 50, N = 10000, parent = mac)
hyb <- population("hyb", time = 800, N = 10000, parent = ocai)

#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 801, end = 1000, 0.75)
gf2 <- gene_flow(from = mac, to = hyb, start = 801, end = 1000, 0.75)
gf3 <- gene_flow(from = hyb, to = mac, start = 801, end = 1000, 0.75)
gf4 <- gene_flow(from = hyb, to = ocai, start = 801, end = 1000, 0.75)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4),
                       generation_time = 1,
                       sim_length = 1000,
                       path = "~/Downloads/slim.examples/tow",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 1000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1  1000 mac      16 NA     NA     NA    NA   
## 2  1000 hyb       8 NA     NA     NA    NA   
## 3  1000 ocai      8 NA     NA     NA    NA

sim first model

#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 314)
#check out resulting trees
ts_file <- file.path(model$path, "output_msprime.trees")
file.exists(ts_file)
## [1] TRUE
#load in tree sequence
ts <- ts_load(model)
ts
## ╔═══════════════════════════╗
## ║TreeSequence               ║
## ╠═══════════════╤═══════════╣
## ║Trees          │      18062║
## ╟───────────────┼───────────╢
## ║Sequence Length│   10000000║
## ╟───────────────┼───────────╢
## ║Time Units     │generations║
## ╟───────────────┼───────────╢
## ║Sample Nodes   │         64║
## ╟───────────────┼───────────╢
## ║Total Size     │    2.7 MiB║
## ╚═══════════════╧═══════════╝
## ╔═══════════╤═════╤═════════╤════════════╗
## ║Table      │Rows │Size     │Has Metadata║
## ╠═══════════╪═════╪═════════╪════════════╣
## ║Edges      │63884│  1.9 MiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Individuals│   32│920 Bytes│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Migrations │    0│  8 Bytes│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Mutations  │    0│ 16 Bytes│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Nodes      │11287│308.6 KiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Populations│    3│301 Bytes│         Yes║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Provenances│    1│  2.7 KiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Sites      │    0│ 16 Bytes│          No║
## ╚═══════════╧═════╧═════════╧════════════╝
ts_coalesced(ts) #confirm coalescence
## [1] TRUE
#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)
ts
## ╔═══════════════════════════╗
## ║TreeSequence               ║
## ╠═══════════════╤═══════════╣
## ║Trees          │      18062║
## ╟───────────────┼───────────╢
## ║Sequence Length│   10000000║
## ╟───────────────┼───────────╢
## ║Time Units     │generations║
## ╟───────────────┼───────────╢
## ║Sample Nodes   │         64║
## ╟───────────────┼───────────╢
## ║Total Size     │    4.0 MiB║
## ╚═══════════════╧═══════════╝
## ╔═══════════╤═════╤═════════╤════════════╗
## ║Table      │Rows │Size     │Has Metadata║
## ╠═══════════╪═════╪═════════╪════════════╣
## ║Edges      │63884│  1.9 MiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Individuals│   32│920 Bytes│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Migrations │    0│  8 Bytes│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Mutations  │20784│751.0 KiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Nodes      │11287│308.6 KiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Populations│    3│301 Bytes│         Yes║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Provenances│    2│  3.5 KiB│          No║
## ╟───────────┼─────┼─────────┼────────────╢
## ║Sites      │20755│506.7 KiB│          No║
## ╚═══════════╧═════╧═════════╧════════════╝
# extract a tree in the tree sequence
tree <- ts_phylo(ts, 280)
## Starting checking the validity of tree...
## Found number of tips: n = 64 
## Found number of nodes: m = 63 
## Done.
tree
## 
## Phylogenetic tree with 64 tips and 63 internal nodes.
## 
## Tip labels:
##   63 (ocai_8), 62 (ocai_8), 61 (ocai_7), 60 (ocai_7), 59 (ocai_6), 58 (ocai_6), ...
## Node labels:
##   8960, 65, 66, 113, 205, 227, ...
## 
## Rooted; includes branch lengths.
ggtree(tree) +
  geom_point2(aes(subset = !isTip)) + # points for internal nodes
  geom_tiplab() + # sample labels for tips
  hexpand(0.1)    # make more space for the tip labels

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 20755
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 20755
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 20755
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant: 20755
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 23 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 23 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

sample.sets<-list(mac = c("mac_1","mac_2","mac_3","mac_4","mac_5","mac_6","mac_7","mac_8"),
                   ocai= c("ocai_1","ocai_2","ocai_3","ocai_4"),
                   hyb = c("hyb_1","hyb_2","hyb_3","hyb_4"))
ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y           Fst
##   <chr> <chr>     <dbl>
## 1 mac   ocai  -0.000609
## 2 mac   hyb    0.00419 
## 3 ocai  hyb    0.0104
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim second model

#define each population
mac <- population("mac", time = 1, N = 10000)
ocai <- population("ocai", time = 50, N = 10000, parent = mac)
hyb <- population("hyb", time = 4000, N = 10000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 4001, end = 5000, 0.75)
gf2 <- gene_flow(from = mac, to = hyb, start = 4001, end = 5000, 0.75)
gf3 <- gene_flow(from = hyb, to = mac, start = 4001, end = 5000, 0.75)
gf4 <- gene_flow(from = hyb, to = ocai, start = 4001, end = 5000, 0.75)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4),
                       generation_time = 1,
                       sim_length = 5000,
                       path = "~/Downloads/slim.examples/tow2",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 5000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1  5000 mac      16 NA     NA     NA    NA   
## 2  5000 hyb       8 NA     NA     NA    NA   
## 3  5000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.2.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.2.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 26646
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 26646
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 26646
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant: 26646
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 31 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 31 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0214
## 2 mac   hyb   0.0166
## 3 ocai  hyb   0.0121
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim third model

#define each population
mac <- population("mac", time = 1, N = 10000)
ocai <- population("ocai", time = 50, N = 10000, parent = mac)
hyb <- population("hyb", time = 4000, N = 10000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 4001, end = 5000, 0.5)
gf2 <- gene_flow(from = mac, to = hyb, start = 4001, end = 5000, 0.5)
gf3 <- gene_flow(from = hyb, to = mac, start = 4001, end = 5000, 0.5)
gf4 <- gene_flow(from = hyb, to = ocai, start =4001, end = 5000, 0.5)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4),
                       generation_time = 1,
                       sim_length = 5000,
                       path = "~/Downloads/slim.examples/tow3",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.3.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.3.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 26372
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 26372
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 26372
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant: 26372
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 25 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 25 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0510
## 2 mac   hyb   0.0359
## 3 ocai  hyb   0.0157
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim fourth model

#define each population
mac <- population("mac", time = 1, N = 10000)
ocai <- population("ocai", time = 50, N = 10000, parent = mac)
hyb <- population("hyb", time = 3500, N = 10000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 3501, end = 4500, 0.75)
gf2 <- gene_flow(from = mac, to = hyb, start = 3501, end = 4500, 0.75)
gf3 <- gene_flow(from = hyb, to = mac, start = 3501, end = 4500, 0.75)
gf4 <- gene_flow(from = hyb, to = ocai, start =3501, end = 4500, 0.75)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4),
                       generation_time = 1,
                       sim_length = 5000,
                       path = "~/Downloads/slim.examples/tow4",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.4.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.4.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 26391
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 26391
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 26391
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant: 26391
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 24 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 24 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0425
## 2 mac   hyb   0.0223
## 3 ocai  hyb   0.0268
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim fifth model

#define each population
mac <- population("mac", time = 1, N = 10000)
ocai <- population("ocai", time = 50, N = 10000, parent = mac)
hyb <- population("hyb", time = 10000, N = 10000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 10001, end = 15000, 0.5)
gf2 <- gene_flow(from = mac, to = hyb, start = 10001, end = 15000, 0.5)
gf3 <- gene_flow(from = hyb, to = mac, start = 10001, end = 15000, 0.5)
gf4 <- gene_flow(from = hyb, to = ocai, start =10001, end = 15000, 0.5)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow5",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.5.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.5.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 37447
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 37447
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 37447
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant: 37447
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 41 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 41 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y       Fst
##   <chr> <chr> <dbl>
## 1 mac   ocai  0.232
## 2 mac   hyb   0.181
## 3 ocai  hyb   0.157
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim sixth model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 10000, N = 25000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 10001, end = 15000, 0.8)
gf2 <- gene_flow(from = mac, to = hyb, start = 10001, end = 15000, 0.8)
gf3 <- gene_flow(from = hyb, to = mac, start = 10001, end = 15000, 0.8)
gf4 <- gene_flow(from = hyb, to = ocai, start =10001, end = 15000, 0.8)
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.2)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.2)
gf3.x <- gene_flow(from = hyb, to = mac, start = 15001, end = 20000, 0.2)
gf4.x <- gene_flow(from = hyb, to = ocai, start =15001, end = 20000, 0.2)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4,gf.x,gf2.x,gf3.x,gf4.x),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow6",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.6.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.6.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 75885
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 75885
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 75885
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant: 75885
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 198 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 198 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0717
## 2 mac   hyb   0.0476
## 3 ocai  hyb   0.0381
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim seventh model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 10000, N = 25000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 10001, end = 15000, 0.2)
gf2 <- gene_flow(from = mac, to = hyb, start = 10001, end = 15000, 0.2)
gf3 <- gene_flow(from = hyb, to = mac, start = 10001, end = 15000, 0.2)
gf4 <- gene_flow(from = hyb, to = ocai, start =10001, end = 15000, 0.2)
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.8)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.8)
gf3.x <- gene_flow(from = hyb, to = mac, start = 15001, end = 20000, 0.8)
gf4.x <- gene_flow(from = hyb, to = ocai, start =15001, end = 20000, 0.8)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4,gf.x,gf2.x,gf3.x,gf4.x),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow7",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.7.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.7.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 75827
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 75827
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 75827
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant: 75827
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 197 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 197 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0422
## 2 mac   hyb   0.0228
## 3 ocai  hyb   0.0156
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

sim eighth model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 5000, N = 25000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 5001, end = 15000, 0.2)
gf2 <- gene_flow(from = mac, to = hyb, start = 5001, end = 15000, 0.2)
gf3 <- gene_flow(from = hyb, to = mac, start = 5001, end = 15000, 0.2)
gf4 <- gene_flow(from = hyb, to = ocai, start =5001, end = 15000, 0.2)
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.8)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.8)
gf3.x <- gene_flow(from = hyb, to = mac, start = 15001, end = 20000, 0.8)
gf4.x <- gene_flow(from = hyb, to = ocai, start =15001, end = 20000, 0.8)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4,gf.x,gf2.x,gf3.x,gf4.x),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow8",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.8.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.8.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 78467
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 78467
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 78467
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant 76000
Processed variant 77000
Processed variant 78000
Processed variant: 78467
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 235 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 235 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0409
## 2 mac   hyb   0.0196
## 3 ocai  hyb   0.0160
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Low migration followed by recent/ongoing high migration can generally recapitulate our empirical data if hybridization is bidirectional going into and out of the hybrid zone.

sim ninth model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 5000, N = 25000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf <- gene_flow(from = ocai, to = hyb, start = 5001, end = 15000, 0.8)
gf2 <- gene_flow(from = mac, to = hyb, start = 5001, end = 15000, 0.8)
gf3 <- gene_flow(from = hyb, to = mac, start = 5001, end = 15000, 0.8)
gf4 <- gene_flow(from = hyb, to = ocai, start =5001, end = 15000, 0.8)
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.2)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.2)
gf3.x <- gene_flow(from = hyb, to = mac, start = 15001, end = 20000, 0.2)
gf4.x <- gene_flow(from = hyb, to = ocai, start =15001, end = 20000, 0.2)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf,gf2,gf3,gf4,gf.x,gf2.x,gf3.x,gf4.x),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow9",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.9.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.9.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 77888
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 77888
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 77888
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant 76000
Processed variant 77000
Processed variant: 77888
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 214 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 214 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0733
## 2 mac   hyb   0.0469
## 3 ocai  hyb   0.0382
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

High migration followed by low migration creates a nearly identical pattern to low migration followed by high migration. This indicates to me that we don’t actually have great power to tell old from new hybridization. Now lets try simpler models with just one big, recent/ongoing migration pulse.

sim tenth model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 15000, N = 25000, parent = mac)

#add in the gene flow edge
#add in the gene flow edge
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.8)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.8)
gf3.x <- gene_flow(from = hyb, to = mac, start = 15001, end = 20000, 0.8)
gf4.x <- gene_flow(from = hyb, to = ocai, start =15001, end = 20000, 0.8)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf.x,gf2.x,gf3.x,gf4.x),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow10",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.10.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.10.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 72257
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 72257
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 72257
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant: 72257
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 201 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 201 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0485
## 2 mac   hyb   0.0195
## 3 ocai  hyb   0.0210
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

The founding of a population in the center of the hybrid zone by either parental, followed by strong bidirectional migration in and out of the hybrid zone (essentially continuous gene flow across a transect with weak isolating barriers) for the past 5K generations can generate patterns extremely similar to the empirical ones we observe in phylogeny, Fst, and heterozygosity.

sim 11th model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 15000, N = 25000, parent = ocai)

#add in the gene flow edge
#add in the gene flow edge
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.8)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.8)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf.x,gf2.x),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow11",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 3141)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.11.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.11.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 70541
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 70541
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 70541
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant: 70541
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 175 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 175 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.167 
## 2 mac   hyb   0.0691
## 3 ocai  hyb   0.0360
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

removing the bidirectional migration results in the classic pattern of elevated heterozygosity in the hybrid population. This makes sense because this is what we think of in a classic tension zone or hybrid sink. Constant immigration of parentals into the zone maintains a hybrid population with low pop size and many recent backcrosses. This is in contrast with what we see in our empirical data, which points to stable, bidirectional migration with geographic isolation preventing full lineage collapse.

sim 12th model

#define each population
mac <- population("mac", time = 1, N = 25000)
ocai <- population("ocai", time = 50, N = 25000, parent = mac)
hyb <- population("hyb", time = 15000, N = 25000, parent = mac)

#add in the gene flow edge
#add in the gene flow edge
gf.x <- gene_flow(from = ocai, to = hyb, start = 15001, end = 20000, 0.8)
gf2.x <- gene_flow(from = mac, to = hyb, start = 15001, end = 20000, 0.6)
gf3.x <- gene_flow(from = hyb, to = mac, start = 15001, end = 20000, 0.6)
gf4.x <- gene_flow(from = hyb, to = ocai, start =15001, end = 20000, 0.8)
gf <- gene_flow(from = mac, to = ocai, start =15000, end = 20000, 0.4)

#compile the model
model <- compile_model(populations = list(mac,ocai,hyb),
                       gene_flow = list(gf.x,gf2.x,gf3.x,gf4.x,gf),
                       generation_time = 1,
                       sim_length = 20000,
                       path = "~/Downloads/slim.examples/tow12",
                       overwrite = TRUE, force = TRUE)

#plot the model to make sure you set it up correctly
plot_model(model, sizes = TRUE, proportions = TRUE)

#schedule sampling
present_samples <- schedule_sampling(model, times = 20000, list(mac, 16), list(hyb, 8), list(ocai, 8), strict = TRUE)
present_samples
## # A tibble: 3 × 7
##    time pop       n y_orig x_orig y     x    
##   <int> <chr> <int> <lgl>  <lgl>  <lgl> <lgl>
## 1 20000 mac      16 NA     NA     NA    NA   
## 2 20000 hyb       8 NA     NA     NA    NA   
## 3 20000 ocai      8 NA     NA     NA    NA
#sim the model in msprime
msprime(model, sequence_length = 1e7, recombination_rate = 1e-8,
        sampling = present_samples, random_seed = 315)

#load in tree sequence
ts <- ts_load(model)

#add mutations to simulation
ts <- ts_mutate(ts, mutation_rate = 1e-8, random_seed = 567)

#write out vcf
ts_vcf(ts, path = "~/Downloads/slim.examples/output.12.vcf.gz")
#read in vcf
vcfR <- read.vcfR("~/Downloads/slim.examples/output.12.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 5
##   header_line: 6
##   variant count: 71087
##   column count: 41
## 
Meta line 5 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 71087
##   Character matrix gt cols: 41
##   skip: 0
##   nrows: 71087
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant: 71087
## All variants processed
#convert to genlight
gen<-vcfR2genlight(vcfR)
## Warning in vcfR2genlight(vcfR): Found 199 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 199 loci will be omitted from the genlight object.
#
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = FALSE)
plot(nj(sample.div), type = "unrooted", cex = 1)

ts_fst(ts, sample.sets)
## # A tibble: 3 × 3
##   x     y        Fst
##   <chr> <chr>  <dbl>
## 1 mac   ocai  0.0383
## 2 mac   hyb   0.0186
## 3 ocai  hyb   0.0156
#calculate heterozygosity per sample
mat<-extract.gt(vcfR)
sample.info$sim.het<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
sample.info$sim.pop<-c(rep("hyb", times=8),rep("mac", times=16),rep("ocai", times=8))
#write loop to calculate heterozygosity
for (i in 1:ncol(mat)){
  sample.info$sim.het[i]<-(sum(mat[,i][!is.na(mat[,i])] == "0|1")+sum(mat[,i][!is.na(mat[,i])] == "1|0"))/sum(!is.na(mat[,i]))
}
sample.info$sim.het<-as.numeric(as.character(sample.info$sim.het))
#plot heterozygosity violin plots
ggplot(sample.info, aes(x=sim.pop, y=sim.het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.75,aes(fill=sample.loc))+
  theme_classic()+
  labs(x="sampling locality",y="heterozygosity")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

The hybrid pop being founded by maculatus rather than ocai, plus adding an additional directional migration edge from maculatus into ocai could together explain the elevated heterozygosity of the ocai lineage. This seems evolutionarily feasible, as the maculatus lineage seems unlikely to experience long distance migrants from isolated, sedentary ocai populations in western mexico, while ocai populations could be experiencing a constant (low) level of pure maculatus migrants raining down into their range from the mountain ranges just to the north.